{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "第06课：动手实战基于 ML 的中文短文本分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "import jieba\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#加载停用词\n",
    "stopwords=pd.read_csv('stopwords.txt',index_col=False,quoting=3,sep=\"\\t\",names=['stopword'], encoding='utf-8')\n",
    "stopwords=stopwords['stopword'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['!', '\"', '#', ..., '450', '22549', '22544'], dtype=object)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#加载语料\n",
    "laogong_df = pd.read_csv('../data/06/beilaogongda.csv', encoding='utf-8', sep=',')\n",
    "laopo_df = pd.read_csv('../data/06/beilaogongda.csv', encoding='utf-8', sep=',')\n",
    "erzi_df = pd.read_csv('../data/06/beierzida.csv', encoding='utf-8', sep=',')\n",
    "nver_df = pd.read_csv('../data/06/beinverda.csv', encoding='utf-8', sep=',')\n",
    "#删除语料的nan行\n",
    "laogong_df.dropna(inplace=True)\n",
    "laopo_df.dropna(inplace=True)\n",
    "erzi_df.dropna(inplace=True)\n",
    "nver_df.dropna(inplace=True)\n",
    "#转换\n",
    "laogong = laogong_df.segment.values.tolist()\n",
    "laopo = laopo_df.segment.values.tolist()\n",
    "erzi = erzi_df.segment.values.tolist()\n",
    "nver = nver_df.segment.values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#定义分词和打标签函数preprocess_text\n",
    "#参数content_lines即为上面转换的list\n",
    "#参数sentences是定义的空list，用来储存打标签之后的数据\n",
    "#参数category 是类型标签\n",
    "def preprocess_text(content_lines, sentences, category):\n",
    "    for line in content_lines:\n",
    "        try:\n",
    "            segs=jieba.lcut(line)\n",
    "            segs = [v for v in segs if not str(v).isdigit()]#去数字\n",
    "            segs = list(filter(lambda x:x.strip(), segs))   #去左右空格\n",
    "            segs = list(filter(lambda x:len(x)>1, segs)) #长度为1的字符\n",
    "            segs = list(filter(lambda x:x not in stopwords, segs)) #去掉停用词\n",
    "            sentences.append((\" \".join(segs), category))# 打标签\n",
    "        except Exception:\n",
    "            print(line)\n",
    "            continue "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /var/folders/yw/k7z_d_3567g16ss9plk47x9w0000gn/T/jieba.cache\n",
      "Loading model cost 0.661 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "sentences = []\n",
    "preprocess_text(laogong, sentences,0)\n",
    "preprocess_text(laopo, sentences, 1)\n",
    "preprocess_text(erzi, sentences, 2)\n",
    "preprocess_text(nver, sentences, 3)"
   ]
  },
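  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check before modeling: count how many labeled examples each class ends up with (0 = laogong, 1 = laopo, 2 = erzi, 3 = nver). A minimal sketch that only assumes the `sentences` list built above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "# examples per label; a heavy imbalance would make accuracy misleading\n",
    "print(Counter(label for _, label in sentences))"
   ]
  },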
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "random.shuffle(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "老公 人伤 通知 救护车 致电 持械 情况不明 民警 携带 防护 装备 1\n",
      "报警 女儿 无需 民警 到场 3\n",
      "报警 人称 女儿 持械 人伤 情况 不详 民警 到场 3\n",
      "报警 老公 民警 到场 0\n",
      "报警 丈夫 持械 人伤 无需 救护 民警 到场 民警 防护 设备 1\n",
      "报警 老公 棍子 人伤 民警 携带 防护 装备 到场 1\n",
      "报警 人称 儿子 民警 到场 2\n",
      "报警 女儿 人伤 民警 到场 携带 防护 装备 3\n",
      "报警 老公 民警 到场 1\n",
      "报警 儿子 民警 到场 2\n"
     ]
    }
   ],
   "source": [
    "for sentence in sentences[:10]:\n",
    "        print(sentence[0], sentence[1])  #下标0是词列表，1是标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "vec = CountVectorizer(\n",
    "    analyzer='word', # tokenise by character ngrams\n",
    "    max_features=4000,  # keep the most common 1000 ngrams\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "x, y = zip(*sentences)\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1256)"
   ]
  },
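  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With no explicit `test_size`, `train_test_split` holds out 25% of the data by default; a quick check of the resulting sizes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# roughly a 3:1 train/test split\n",
    "print(len(x_train), len(x_test))"
   ]
  },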
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=4000, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "        strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
       "        tokenizer=None, vocabulary=None)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vec.fit(x_train)"
   ]
  },
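  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To see what the vectorizer learned, peek at the vocabulary size and a few feature names. `get_feature_names()` matches the sklearn version used here; newer releases renamed it to `get_feature_names_out()`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vocabulary size (capped at max_features) and a small sample of tokens\n",
    "print(len(vec.vocabulary_))\n",
    "print(vec.get_feature_names()[:10])"
   ]
  },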
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "classifier = MultinomialNB()\n",
    "classifier.fit(vec.transform(x_train), y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.6357308584686775\n"
     ]
    }
   ],
   "source": [
    "print(classifier.score(vec.transform(x_test), y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pre = classifier.predict(vec.transform(x_test))"
   ]
  },
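  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Overall accuracy hides per-class behavior. A minimal per-class evaluation, assuming the `pre` predictions from the cell above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import classification_report, confusion_matrix\n",
    "# precision/recall/F1 per class, then the 4x4 confusion matrix\n",
    "print(classification_report(y_test, pre))\n",
    "print(confusion_matrix(y_test, pre))"
   ]
  },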
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.568445475638051\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "vec = CountVectorizer(\n",
    "    analyzer='word', # tokenise by character ngrams\n",
    "    ngram_range=(1,4),  # use ngrams of size 1 and 2\n",
    "    max_features=20000,  # keep the most common 1000 ngrams\n",
    ")\n",
    "vec.fit(x_train)\n",
    "#用朴素贝叶斯算法进行模型训练\n",
    "classifier = MultinomialNB()\n",
    "classifier.fit(vec.transform(x_train), y_train)\n",
    "#对结果进行评分\n",
    "print(classifier.score(vec.transform(x_test), y_test))"
   ]
  },
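  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`CountVectorizer` uses raw term counts; TF-IDF weighting, which down-weights terms that appear in many documents, is a common variant worth trying. A minimal sketch on the same split (`tfidf_vec` and `tfidf_nb` are illustrative names):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "# same word-level n-gram features, but TF-IDF weighted instead of raw counts\n",
    "tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), max_features=20000)\n",
    "tfidf_vec.fit(x_train)\n",
    "tfidf_nb = MultinomialNB()\n",
    "tfidf_nb.fit(tfidf_vec.transform(x_train), y_train)\n",
    "print(tfidf_nb.score(tfidf_vec.transform(x_test), y_test))"
   ]
  },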
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5522041763341067\n"
     ]
    }
   ],
   "source": [
    "from sklearn.svm import SVC\n",
    "svm = SVC(kernel='linear')\n",
    "svm.fit(vec.transform(x_train), y_train)\n",
    "print(svm.score(vec.transform(x_test), y_test))"
   ]
  },
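  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A single train/test split can be noisy on a corpus this small. A sketch of 5-fold cross-validation with a `Pipeline` so each fold fits its own vocabulary, reusing the full `x`, `y` from the earlier `zip`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "# vectorizer + linear SVM, refitted from scratch inside each fold\n",
    "pipe = Pipeline([\n",
    "    ('vec', CountVectorizer(analyzer='word', max_features=4000)),\n",
    "    ('svm', SVC(kernel='linear')),\n",
    "])\n",
    "scores = cross_val_score(pipe, list(x), list(y), cv=5)\n",
    "print(scores.mean(), scores.std())"
   ]
  },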
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import xgboost as xgb  \n",
    "# from sklearn.model_selection import StratifiedKFold  \n",
    "# import numpy as np\n",
    "# # xgb矩阵赋值  \n",
    "# xgb_train = xgb.DMatrix(vec.transform(x_train), label=y_train)  \n",
    "# xgb_test = xgb.DMatrix(vec.transform(x_test)) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "  params = {  \n",
    "            'booster': 'gbtree',     #使用gbtree\n",
    "            'objective': 'multi:softmax',  # 多分类的问题、  \n",
    "            # 'objective': 'multi:softprob',   # 多分类概率  \n",
    "            #'objective': 'binary:logistic',  #二分类\n",
    "            'eval_metric': 'merror',   #logloss\n",
    "            'num_class': 4,  # 类别数，与 multisoftmax 并用  \n",
    "            'gamma': 0.1,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。  \n",
    "            'max_depth': 8,  # 构建树的深度，越大越容易过拟合  \n",
    "            'alpha': 0,   # L1正则化系数  \n",
    "            'lambda': 10,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。  \n",
    "            'subsample': 0.7,  # 随机采样训练样本  \n",
    "            'colsample_bytree': 0.5,  # 生成树时进行的列采样  \n",
    "            'min_child_weight': 3,  \n",
    "            # 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言  \n",
    "            # 假设 h 在 0.01 附近，min_child_weight 为 1 叶子节点中最少需要包含 100 个样本。  \n",
    "            'silent': 0,  # 设置成1则没有运行信息输出，最好是设置为0.  \n",
    "            'eta': 0.03,  # 如同学习率  \n",
    "            'seed': 1000,  \n",
    "            'nthread': -1,  # cpu 线程数  \n",
    "            'missing': 1 \n",
    "        }  "
   ]
  },
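  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Putting the disabled pieces together: a minimal sketch of xgboost training with a trimmed version of the parameters above. It assumes `xgboost` is installed and reuses the fitted n-gram `vec`; `num_boost_round=200` is an arbitrary illustrative choice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import xgboost as xgb\n",
    "\n",
    "# sparse feature matrices -> DMatrix; labels must be array-like\n",
    "xgb_train = xgb.DMatrix(vec.transform(x_train), label=np.asarray(y_train))\n",
    "xgb_test = xgb.DMatrix(vec.transform(x_test))\n",
    "\n",
    "params = {\n",
    "    'booster': 'gbtree',\n",
    "    'objective': 'multi:softmax',  # predict the class index directly\n",
    "    'eval_metric': 'merror',       # multiclass error rate\n",
    "    'num_class': 4,\n",
    "    'max_depth': 8,\n",
    "    'eta': 0.03,\n",
    "    'seed': 1000\n",
    "}\n",
    "# train for a fixed number of rounds, reporting train error as it goes\n",
    "bst = xgb.train(params, xgb_train, num_boost_round=200, evals=[(xgb_train, 'train')])\n",
    "pred = bst.predict(xgb_test)  # class indices, returned as floats\n",
    "print((pred == np.asarray(y_test)).mean())  # test accuracy"
   ]
  }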
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
